In this file I analyze the data I collected. Here's a bit of notation. For an assumed prime $p$, let $S(x)$ denote the partial sum of Legendre symbols, let $F(x)$ denote Polya's Fourier Expansion, and let $D(x) := S(x) - F(x)$ denote the difference. A "diff plot" plots $x$ vs. $D(x)$.
import os
import random
from PIL import Image
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
Column Description:
path = os.path.join("/Users", "jcheigh", 'Thesis')
plot_path = os.path.join(path, "plots")
csv_name = f"{path}/data.csv"
final_csv_name = f"{path}/final.csv"
df = pd.read_csv(csv_name)
get_percentage = lambda num, den : round(num / den * 100, 5)
get_distance = lambda x, y : round(abs(100 - x - y), 2)
df['max_error'] = df.apply(lambda row: max(row['pos_error'], -1 * row['neg_error']), axis = 1)
df['max_error_ind'] = np.where(df['max_error'] == df['pos_error'], 1, 0)
df["x_max_error"] = np.where(df["max_error_ind"] == 1, df["x_pos_error"], df["x_neg_error"])
df["mag_diff"] = df.apply(lambda row : abs(row["pos_error"] + row["neg_error"]), axis = 1)
df["mod_4"] = df["prime"].apply(lambda prime : prime % 4)
df["x_pos_error"] = np.vectorize(get_percentage)(df["x_pos_error"], df["prime"])
df["x_neg_error"] = np.vectorize(get_percentage)(df["x_neg_error"], df["prime"])
df["x_max_error"] = np.vectorize(get_percentage)(df["x_max_error"], df["prime"])
df["dist_check"] = np.vectorize(get_distance)(df["x_pos_error"], df["x_neg_error"])
df = df.sort_values("prime")
df_small_prime = df[df["prime"] < 200000]
df_small_pos_error = df[df['pos_error'] <= 50]
df_small_mag = df[df["mag_diff"] <= .17]
df_large_mag = df[df["mag_diff"] > .17]
df.to_csv(final_csv_name, index = False)
df
| prime | pos_error | neg_error | x_pos_error | x_neg_error | max_error | max_error_ind | x_max_error | mag_diff | mod_4 | dist_check | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 78 | 101359 | 27.71 | -17.25 | 46.34122 | 30.89908 | 27.71 | 1 | 46.34122 | 10.46 | 3 | 22.76 |
| 123 | 102653 | 23.70 | -23.62 | 50.34436 | 49.65271 | 23.70 | 1 | 50.34436 | 0.08 | 1 | 0.00 |
| 9 | 102859 | 24.54 | -22.80 | 14.94181 | 69.65069 | 24.54 | 1 | 14.94181 | 1.74 | 3 | 15.41 |
| 50 | 104479 | 17.74 | -27.27 | 58.53617 | 87.80329 | 27.27 | 0 | 87.80329 | 9.53 | 3 | 46.34 |
| 4 | 104759 | 51.11 | -0.00 | 52.31436 | 99.99809 | 51.11 | 1 | 52.31436 | 51.11 | 3 | 52.31 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 88 | 196579 | 30.61 | -33.39 | 48.28644 | 80.94812 | 33.39 | 0 | 80.94812 | 2.78 | 3 | 29.23 |
| 19 | 197551 | 28.33 | -33.93 | 44.84867 | 58.61474 | 33.93 | 0 | 58.61474 | 5.60 | 3 | 3.46 |
| 71 | 199499 | 46.06 | -17.20 | 49.99925 | 5.74790 | 46.06 | 1 | 49.99925 | 28.86 | 3 | 44.25 |
| 132 | 1000003 | 77.32 | -45.22 | 31.01061 | 48.52665 | 77.32 | 1 | 31.01061 | 32.10 | 3 | 20.46 |
| 131 | 1020389 | 80.51 | -80.51 | 24.99988 | 74.99983 | 80.51 | 1 | 24.99988 | 0.00 | 1 | 0.00 |
133 rows × 11 columns
intervals = [10000 * k for k in range(10)] + [100000 * k for k in range(1, 12)] + [float('inf')]
buckets = [(left, right) for left, right in zip(intervals, intervals[1:])]
def get_bucket(prime):
# can use binary search but doesn't matter
for (left, right) in buckets:
if left <= prime <= right:
return (left, right)
raise Exception("How did you get here....")
prime_bucket = df["prime"].apply(lambda prime : get_bucket(prime))
prime_vals = prime_bucket.value_counts()
empty_buckets = [bucket for bucket in buckets if bucket not in prime_vals.keys()]
print("=" * 40)
print(f"There are {len(df)} datapoints")
print("=" * 40)
for bucket, count in prime_vals.items():
ideal_count = 1 if bucket != (100000, 200000) else 150
remaining = max(ideal_count - count, 0)
print('-' * 30)
print(f"Bucket: {bucket}")
print(f"Current Count: {count}")
print(f"Ideal Count: {ideal_count}")
print(f"Remaining: {remaining}")
print('-' * 30)
print("Empty Buckets")
for bucket in empty_buckets:
print(bucket)
print("=" * 40)
======================================== There are 133 datapoints ======================================== ------------------------------ Bucket: (100000, 200000) Current Count: 131 Ideal Count: 150 Remaining: 19 ------------------------------ Bucket: (1000000, 1100000) Current Count: 2 Ideal Count: 1 Remaining: 0 ------------------------------ Empty Buckets (0, 10000) (10000, 20000) (20000, 30000) (30000, 40000) (40000, 50000) (50000, 60000) (60000, 70000) (70000, 80000) (80000, 90000) (90000, 100000) (200000, 300000) (300000, 400000) (400000, 500000) (500000, 600000) (600000, 700000) (700000, 800000) (800000, 900000) (900000, 1000000) (1100000, inf) ========================================
Helpers
names = {
"prime" : "Prime",
"pos_error" : "Max Pos. Error",
"neg_error" : "Max Neg. Error",
"x_pos_error" : "Index of Pos. Error (%)",
"x_neg_error" : "Index of Neg. Error (%)",
"max_error" : "Max Error Mag.",
"max_error_ind" : "Error Indicator",
"x_max_error" : "Index of Error Mag. (%)",
"mag_diff" : "Difference in Errors",
"dist_check" : "Distance Check",
"mod_4" : "Prime Mod 4",
"max_error_p" : "Max Error Mag. (%)",
"pos_error_p" : "Max Pos Error (%)",
"neg_error_p" : "Max Neg Error (%)"
}
def plot_diff_lst(prime: int):
plot_path = os.path.join("/Users", "jcheigh", "Thesis", "plots")
plot_path = f"{plot_path}/Prime = {prime} Difference Plot.png"
try:
img = Image.open(plot_path)
plt.figure(figsize = (20,10))
plt.imshow(img)
plt.axis('off') # Turn off axis
plt.show()
except Exception:
print(f"Prime = {prime} hasn't been analyzed yet")
categorical_features = ["max_error_ind", "mod_4"]
numerical_features = [col for col in df.columns if col not in categorical_features]
Plot Functionality:
def histplot(x, df = df, bins = 30, xlabel = None, title = None, savefig = False, savename = None):
# distribution of numerical
plt.figure(figsize = (10,8))
p = sns.histplot(x = x, data = df, bins = bins,
kde = True, fill = True,
edgecolor = "black", linewidth = 3
)
p.axes.lines[0].set_color("orange")
if xlabel is None:
xlabel = names[x]
if title is None:
title = f"{xlabel} Distribution"
plt.ylabel("Count", fontsize = 20)
plt.xlabel(xlabel, fontsize = 20)
plt.title(title, fontsize = 25)
if savefig:
path_to_save = f"{plot_path}/{savename}.png"
plt.savefig(path_to_save)
plt.show()
def countplot(x, df = df, xlabel = None, title = None, savefig = False, savename = None):
# distribution of categorical
plt.figure(figsize = (10,8))
p = sns.countplot(x = x, data = df,
order = df[x].value_counts().index)
for container in p.containers:
p.bar_label(container, label_type = "center", padding = 6, size = 15, color = "black",
bbox={"boxstyle": "round", "pad": 0.4,"facecolor": "#e0b583",
"edgecolor": "#1c1c1c", "linewidth" : 4, "alpha": 1})
if xlabel is None:
xlabel = names[x]
if title is None:
title = f"Breakdown of {xlabel}"
p.axes.set_title(title, fontsize = 25)
p.axes.set_ylabel("Count",fontsize = 20)
p.axes.set_xlabel(xlabel, fontsize = 20)
if savefig:
path_to_save = f"{plot_path}/{savename}.png"
plt.savefig(path_to_save)
plt.show()
def violinplot(x, y = None, df = df, xlabel = None, ylabel = None, title = None, savefig = False, savename = None):
# x is categorical, y is numerical
plt.figure(figsize = (10, 8))
if y is None:
y = "prime"
p = sns.violinplot(x = x, y = y, data = df,
order = df[x].value_counts().index,
linewidth = 3, edgecolor = "black")
if xlabel is None:
xlabel = names[x]
if ylabel is None:
ylabel = names[y]
if title is None:
title = f"{ylabel} by {xlabel}"
p.axes.set_title(title, fontsize = 25)
p.axes.set_xlabel(xlabel, fontsize = 20)
p.axes.set_ylabel(ylabel, fontsize = 20)
if savefig:
path_to_save = f"{plot_path}/{savename}.png"
plt.savefig(path_to_save)
plt.show()
def kdeplot(x, y = None, data = df, xlabel = None, ylabel = None, title = None, savefig = False, savename = None):
# both numeric
plt.figure(figsize = (10,8))
if y is None:
y = "prime"
p = sns.kdeplot(x = x, y = y, data = df, fill = True)
if xlabel is None:
xlabel = names[x]
if ylabel is None:
ylabel = names[y]
if title is None:
title = f"{ylabel} by {xlabel}"
p.axes.set_title(title ,fontsize=25)
plt.ylabel(ylabel, fontsize = 20)
plt.xlabel(xlabel, fontsize = 20)
if savefig:
path_to_save = f"{plot_path}/{savename}.png"
plt.savefig(path_to_save)
plt.show()
def scatter(y, x = None, df = df, y_trans = "None", xlabel = None, ylabel = None, title = None, savefig = False, savename = None):
# both numeric
plt.figure(figsize = (10, 8))
sns.set(style='ticks')
sns.set_context("poster")
if x is None:
x = "prime"
if xlabel is None:
xlabel = names[x]
if ylabel is None:
ylabel = names[y]
if title is None:
title = f"{ylabel} by {xlabel} with Transformation = {y_trans.title()}"
y = df[y]
x = df[x]
if y_trans == "log":
y = np.log(y)
if y_trans == 'log_squared':
y = np.square(np.log(y))
p = sns.regplot(x = x, y = y)
p.axes.set_title(title, fontsize=25)
plt.ylabel(ylabel, fontsize = 20)
plt.xlabel(xlabel, fontsize = 20)
if savefig:
path_to_save = f"{plot_path}/{savename}.png"
plt.savefig(path_to_save)
plt.show()
Basic Descriptive Stats
df.head()
| prime | pos_error | neg_error | x_pos_error | x_neg_error | max_error | max_error_ind | x_max_error | mag_diff | mod_4 | dist_check | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 78 | 101359 | 27.71 | -17.25 | 46.34122 | 30.89908 | 27.71 | 1 | 46.34122 | 10.46 | 3 | 22.76 |
| 123 | 102653 | 23.70 | -23.62 | 50.34436 | 49.65271 | 23.70 | 1 | 50.34436 | 0.08 | 1 | 0.00 |
| 9 | 102859 | 24.54 | -22.80 | 14.94181 | 69.65069 | 24.54 | 1 | 14.94181 | 1.74 | 3 | 15.41 |
| 50 | 104479 | 17.74 | -27.27 | 58.53617 | 87.80329 | 27.27 | 0 | 87.80329 | 9.53 | 3 | 46.34 |
| 4 | 104759 | 51.11 | -0.00 | 52.31436 | 99.99809 | 51.11 | 1 | 52.31436 | 51.11 | 3 | 52.31 |
df.tail()
| prime | pos_error | neg_error | x_pos_error | x_neg_error | max_error | max_error_ind | x_max_error | mag_diff | mod_4 | dist_check | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 88 | 196579 | 30.61 | -33.39 | 48.29 | 80.95 | 33.39 | 0 | 80.95 | 2.78 | 3 | 29.24 |
| 19 | 197551 | 28.33 | -33.93 | 44.85 | 58.61 | 33.93 | 0 | 58.61 | 5.60 | 3 | 3.46 |
| 71 | 199499 | 46.06 | -17.20 | 50.00 | 5.75 | 46.06 | 1 | 50.00 | 28.86 | 3 | 44.25 |
| 132 | 1000003 | 77.32 | -45.22 | 31.01 | 48.53 | 77.32 | 1 | 31.01 | 32.10 | 3 | 20.46 |
| 131 | 1020389 | 80.51 | -80.51 | 25.00 | 75.00 | 80.51 | 1 | 25.00 | 0.00 | 1 | 0.00 |
df.describe()
| prime | pos_error | neg_error | x_pos_error | x_neg_error | max_error | max_error_ind | x_max_error | mag_diff | mod_4 | dist_check | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 1.330000e+02 | 133.00000 | 133.000000 | 133.000000 | 133.000000 | 133.000000 | 133.000000 | 133.000000 | 133.000000 | 133.000000 | 133.000000 |
| mean | 1.532672e+05 | 32.17406 | -23.993383 | 48.656466 | 57.398647 | 33.040226 | 0.684211 | 48.005188 | 9.913008 | 2.037594 | 17.341880 |
| std | 1.099918e+05 | 12.29249 | 10.201091 | 29.295407 | 29.467138 | 11.849783 | 0.466587 | 28.769678 | 17.170020 | 1.003071 | 23.511079 |
| min | 1.013590e+05 | 10.21000 | -80.510000 | 0.060000 | 0.500000 | 21.350000 | 0.000000 | 0.060000 | 0.000000 | 1.000000 | 0.000000 |
| 25% | 1.158770e+05 | 25.40000 | -28.780000 | 28.470000 | 34.540000 | 26.290000 | 0.000000 | 26.780000 | 0.030000 | 1.000000 | 0.000000 |
| 50% | 1.330730e+05 | 28.05000 | -25.200000 | 44.850000 | 59.640000 | 28.920000 | 1.000000 | 44.690000 | 0.430000 | 3.000000 | 0.280000 |
| 75% | 1.658870e+05 | 34.93000 | -22.030000 | 71.680000 | 82.770000 | 35.360000 | 1.000000 | 68.280000 | 12.140000 | 3.000000 | 33.330000 |
| max | 1.020389e+06 | 96.08000 | -0.000000 | 99.980000 | 100.000000 | 96.080000 | 1.000000 | 99.750000 | 96.080000 | 3.000000 | 87.980000 |
df.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 133 entries, 78 to 131 Data columns (total 11 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 prime 133 non-null int64 1 pos_error 133 non-null float64 2 neg_error 133 non-null float64 3 x_pos_error 133 non-null float64 4 x_neg_error 133 non-null float64 5 max_error 133 non-null float64 6 max_error_ind 133 non-null int64 7 x_max_error 133 non-null float64 8 mag_diff 133 non-null float64 9 mod_4 133 non-null int64 10 dist_check 133 non-null float64 dtypes: float64(8), int64(3) memory usage: 12.5 KB
Example Diff Plots
prime_sample = random.choices(df["prime"].values, k= 4)
for prime in prime_sample:
plot_diff_lst(prime)
Here we see an important trend. All the diff plots look more or less the same (seemingly random Brownian Motion like stuff) with a few important properties:
I will explore both of these thoroughly later
histplot("prime")
histplot("prime", df = df_small_prime)
Prime Distribution:
histplot("pos_error")
for prime in df[df["pos_error"] > 70]['prime'].values:
plot_diff_lst(prime)
These are all either 1 million primes or just super translated up
Max Positive Error Distribution:
print(f"Max Positive Error Mean = {round(df_small_prime['pos_error'].mean(), 2)}")
print(f"Max Positive Error Std. Dev. = {round(df_small_prime['pos_error'].std(), 2)}")
Max Positive Error Mean = 31.46 Max Positive Error Std. Dev. = 10.92
Mean is 31.46 and standard deviation is 10.92. Given < 150 datapoints this is definitely not normal as the observations in the 60s to 100 wouldn't be possible.
histplot("pos_error", df = df_small_pos_error)
Let's investigate this outlier with very low max positive erorr
df[df["pos_error"] < 15]
| prime | pos_error | neg_error | x_pos_error | x_neg_error | max_error | max_error_ind | x_max_error | mag_diff | |
|---|---|---|---|---|---|---|---|---|---|
| 55 | 137831 | 10.21 | -41.56 | 137797 | 67410 | 41.56 | 0 | 67410 | 31.35 |
plot_diff_lst(137831)
Here we see that this is a weird example where the diff plot looks like it was translated down.
scatter("pos_error", title = "Max Positive Error vs. Prime")
scatter("pos_error", df = df_small_prime)
There seems to be a positive correlation between prime and max positive error (the confidence interval around the least squares line visibly doesn't include a line with 0 slope, which is indicative of the $F$ statistic). However, it's obviously more complex than being directly a function of the prime
histplot('neg_error')
Bears some resemblance to pos error except obviously flipped. Here the -80 outlier is due to the 1 million point so not as many outliers on the negative side.
histplot("neg_error", df = df_small_prime)
But, there are some points with virtually no errors that were negative. Let's look at the diff plots for these
df_pos = df[df["neg_error"] > -5]
df_pos
| prime | pos_error | neg_error | x_pos_error | x_neg_error | max_error | max_error_ind | x_max_error | mag_diff | |
|---|---|---|---|---|---|---|---|---|---|
| 4 | 104759 | 51.11 | -0.00 | 54804 | 104757 | 51.11 | 1 | 54804 | 51.11 |
| 90 | 110543 | 42.79 | -0.73 | 28393 | 36066 | 42.79 | 1 | 28393 | 42.06 |
| 52 | 113279 | 56.49 | -0.00 | 50628 | 113277 | 56.49 | 1 | 50628 | 56.49 |
| 62 | 117191 | 53.42 | -0.00 | 47164 | 117189 | 53.42 | 1 | 47164 | 53.42 |
| 113 | 120671 | 44.90 | -1.03 | 115994 | 108589 | 44.90 | 1 | 115994 | 43.87 |
| 3 | 128399 | 64.16 | -0.00 | 71053 | 128397 | 64.16 | 1 | 71053 | 64.16 |
| 48 | 154279 | 58.56 | -0.00 | 51425 | 154277 | 58.56 | 1 | 51425 | 58.56 |
| 80 | 168719 | 78.03 | -0.00 | 34053 | 168717 | 78.03 | 1 | 34053 | 78.03 |
| 35 | 185303 | 96.08 | -0.00 | 111180 | 185301 | 96.08 | 1 | 111180 | 96.08 |
for prime in df_pos["prime"].values:
plot_diff_lst(prime)
These are just translated really high up. The entire plot is basically above the log error threshold. However, they are still relatively bounded, meaning any reasonable big O constant would still make the error term here $O(\log p)$.
histplot("prime", df = df_pos)
This is similar to the initial distribution of prime, so no trend like higher primes are more likely (at least in this range)
df_pos
| prime | pos_error | neg_error | x_pos_error | x_neg_error | max_error | max_error_ind | x_max_error | mag_diff | |
|---|---|---|---|---|---|---|---|---|---|
| 4 | 104759 | 51.11 | -0.00 | 54804 | 104757 | 51.11 | 1 | 54804 | 51.11 |
| 90 | 110543 | 42.79 | -0.73 | 28393 | 36066 | 42.79 | 1 | 28393 | 42.06 |
| 52 | 113279 | 56.49 | -0.00 | 50628 | 113277 | 56.49 | 1 | 50628 | 56.49 |
| 62 | 117191 | 53.42 | -0.00 | 47164 | 117189 | 53.42 | 1 | 47164 | 53.42 |
| 113 | 120671 | 44.90 | -1.03 | 115994 | 108589 | 44.90 | 1 | 115994 | 43.87 |
| 3 | 128399 | 64.16 | -0.00 | 71053 | 128397 | 64.16 | 1 | 71053 | 64.16 |
| 48 | 154279 | 58.56 | -0.00 | 51425 | 154277 | 58.56 | 1 | 51425 | 58.56 |
| 80 | 168719 | 78.03 | -0.00 | 34053 | 168717 | 78.03 | 1 | 34053 | 78.03 |
| 35 | 185303 | 96.08 | -0.00 | 111180 | 185301 | 96.08 | 1 | 111180 | 96.08 |
df[df["pos_error"] >= 50]
| prime | pos_error | neg_error | x_pos_error | x_neg_error | max_error | max_error_ind | x_max_error | mag_diff | |
|---|---|---|---|---|---|---|---|---|---|
| 4 | 104759 | 51.11 | -0.00 | 54804 | 104757 | 51.11 | 1 | 54804 | 51.11 |
| 52 | 113279 | 56.49 | -0.00 | 50628 | 113277 | 56.49 | 1 | 50628 | 56.49 |
| 62 | 117191 | 53.42 | -0.00 | 47164 | 117189 | 53.42 | 1 | 47164 | 53.42 |
| 3 | 128399 | 64.16 | -0.00 | 71053 | 128397 | 64.16 | 1 | 71053 | 64.16 |
| 48 | 154279 | 58.56 | -0.00 | 51425 | 154277 | 58.56 | 1 | 51425 | 58.56 |
| 80 | 168719 | 78.03 | -0.00 | 34053 | 168717 | 78.03 | 1 | 34053 | 78.03 |
| 97 | 176899 | 50.33 | -32.40 | 70758 | 141518 | 50.33 | 1 | 70758 | 17.93 |
| 35 | 185303 | 96.08 | -0.00 | 111180 | 185301 | 96.08 | 1 | 111180 | 96.08 |
| 131 | 1020389 | 80.51 | -80.51 | 255096 | 765290 | 80.51 | 1 | 255096 | 0.00 |
Here I'm checking the overlap between the most positive error and the least negative error.
plot_diff_lst(1020389)
This 1 million datapoint is weird. As opposed to other plots, it exceeds the log error on both sides. Though not by much, this is telling me that maybe we haven't converged to any "final" behavior. In other words, log error looks good for now but the fact that this graph is qualitatively different than that of the other graphs means that it could be dangerous to extrapolate any findings from this dataset
plot_diff_lst(1000003)
Yup, this one shows the same property. This is an example of a translated up graph.
plot_diff_lst(176899)
Looking at the plot more seriosly we just have a "normal" plot with a few major exceptions, in both the positive and negative direction. Another very interesting thing I see here is symmetry. This is something to definitely explore later.
scatter("neg_error", title = "Max Negative Error vs. Prime")
scatter("neg_error", df = df_small_prime, title = "Max Negative Error vs. Prime")
This has a more straightforward decreasing relationship than the positive
histplot("x_pos_error")
Lot virtually at beginning or virtually at end, and then besides that mostly centered
scatter("x_pos_error", df = df_small_prime, title = "x_pos_error vs. prime")
Basically no trend here, meaning that prime has no impact on where (relative to the prime) the index of the pos error occurs.
histplot("x_neg_error")
Here maybe slightly increasing and then huge bump at around 100: a lot of reach max negative error towards the end. Let's look at these.
df[df["x_neg_error"] > 95]
| prime | pos_error | neg_error | x_pos_error | x_neg_error | max_error | max_error_ind | x_max_error | mag_diff | mod_4 | dist_check | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 4 | 104759 | 51.11 | -0.00 | 52.31 | 100.00 | 51.11 | 1 | 52.31 | 51.11 | 3 | 52.31 |
| 57 | 109139 | 38.13 | -9.57 | 38.88 | 97.84 | 38.13 | 1 | 38.88 | 28.56 | 3 | 36.72 |
| 52 | 113279 | 56.49 | -0.00 | 44.69 | 100.00 | 56.49 | 1 | 44.69 | 56.49 | 3 | 44.69 |
| 128 | 113749 | 27.86 | -27.84 | 0.22 | 99.78 | 27.86 | 1 | 0.22 | 0.02 | 1 | 0.00 |
| 104 | 114643 | 20.27 | -26.29 | 0.73 | 99.04 | 26.29 | 0 | 99.04 | 6.02 | 3 | 0.23 |
| 62 | 117191 | 53.42 | -0.00 | 40.25 | 100.00 | 53.42 | 1 | 40.25 | 53.42 | 3 | 40.25 |
| 43 | 118147 | 24.68 | -23.11 | 80.44 | 96.54 | 24.68 | 1 | 80.44 | 1.57 | 3 | 76.98 |
| 3 | 128399 | 64.16 | -0.00 | 55.34 | 100.00 | 64.16 | 1 | 55.34 | 64.16 | 3 | 55.34 |
| 79 | 136943 | 27.54 | -27.23 | 1.12 | 98.69 | 27.54 | 1 | 1.12 | 0.31 | 3 | 0.19 |
| 73 | 147517 | 27.10 | -27.08 | 3.12 | 96.88 | 27.10 | 1 | 3.12 | 0.02 | 1 | 0.00 |
| 48 | 154279 | 58.56 | -0.00 | 33.33 | 100.00 | 58.56 | 1 | 33.33 | 58.56 | 3 | 33.33 |
| 85 | 163621 | 37.71 | -37.71 | 0.16 | 99.84 | 37.71 | 1 | 0.16 | 0.00 | 1 | 0.00 |
| 80 | 168719 | 78.03 | -0.00 | 20.18 | 100.00 | 78.03 | 1 | 20.18 | 78.03 | 3 | 20.18 |
| 35 | 185303 | 96.08 | -0.00 | 60.00 | 100.00 | 96.08 | 1 | 60.00 | 96.08 | 3 | 60.00 |
| 114 | 192347 | 36.32 | -28.70 | 83.37 | 96.95 | 36.32 | 1 | 83.37 | 7.62 | 3 | 80.32 |
| 25 | 192377 | 32.28 | -32.31 | 3.15 | 96.85 | 32.31 | 0 | 96.85 | 0.03 | 1 | 0.00 |
Some, like 104759, 113279, 117191, 168719, and 185303, reach it exactly at 100%. This is because the max negative error is 0 (these are those weird plots that are super translated up). Let's look at the others
for prime in df[(df["x_neg_error"] > 95) & (df["x_neg_error"] < 100)]["prime"].values:
plot_diff_lst(prime)
Prime = 118147 hasn't been analyzed yet
I have no idea why I don't have the plot for p = 118147. Based on the others it seems that these plots have huge spikes right at the tails
df[df["prime"] == 118147]
| prime | pos_error | neg_error | x_pos_error | x_neg_error | max_error | max_error_ind | x_max_error | mag_diff | |
|---|---|---|---|---|---|---|---|---|---|
| 43 | 118147 | 24.68 | -23.11 | 80.44 | 96.54 | 24.68 | 1 | 80.44 | 1.57 |
plot_diff_lst(109139)
scatter('x_neg_error', df = df_small_prime, title = "x_neg_error vs. Prime")
Again no real relationship here
histplot("max_error")
histplot('max_error', df = df_small_prime)
Looks pretty similar to positive error.
scatter("max_error", df = df_small_prime, title = "Max Error Magnitude vs. Prime", savefig = True, savename = "Max Error Mag. vs. Prime")
There is a positive relationship between prime and max error. We also see that the largest max error outliers occur with larger primes.
countplot("max_error_ind", savefig = True, savename = "Max Error Indicator")
Around 2/3 are error indicator == 1.
violinplot("max_error_ind", "prime")
violinplot("max_error_ind", "prime", df = df_small_prime)
Nothing really obvious here
histplot("x_max_error")
scatter("x_max_error", title = "Index of Max Error vs. Prime", df = df_small_prime, savefig = True, savename = "x_max_error vs. prime")
All virtually the same as positive error here.
histplot("mag_diff", savefig = True, savename = "mag_diff distribution")
Interestingly, a vast majority are very close to 0, and then the rest trail off. It's almost as if it's a binary 0 or not 0, and then for those that are not 0 there's a separate distribution
get_length = lambda cutoff : len(df[df["mag_diff"] < cutoff])
for cutoff in [.01, .05, .08, 1, 2, 5, 10]:
print(f"Cutoff: {round(cutoff, 2)}, Length: {get_length(cutoff)}")
Cutoff: 0.01, Length: 9 Cutoff: 0.05, Length: 52 Cutoff: 0.08, Length: 63 Cutoff: 1, Length: 70 Cutoff: 2, Length: 73 Cutoff: 5, Length: 82 Cutoff: 10, Length: 93
Wow, almost half of the points have a magnitude difference within .08
histplot('mag_diff', df = df_small_mag)
df[df["mag_diff"] < .08].sort_values("mag_diff")
| prime | pos_error | neg_error | x_pos_error | x_neg_error | max_error | max_error_ind | x_max_error | mag_diff | mod_4 | dist_check | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 131 | 1020389 | 80.51 | -80.51 | 25.00 | 75.00 | 80.51 | 1 | 25.00 | 0.00 | 1 | 0.00 |
| 27 | 105037 | 24.32 | -24.32 | 23.97 | 76.03 | 24.32 | 1 | 23.97 | 0.00 | 1 | 0.00 |
| 122 | 137197 | 27.51 | -27.51 | 5.80 | 94.19 | 27.51 | 1 | 5.80 | 0.00 | 1 | 0.01 |
| 85 | 163621 | 37.71 | -37.71 | 0.16 | 99.84 | 37.71 | 1 | 0.16 | 0.00 | 1 | 0.00 |
| 82 | 111653 | 23.72 | -23.72 | 96.47 | 3.52 | 23.72 | 1 | 96.47 | 0.00 | 1 | 0.01 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 46 | 105601 | 26.28 | -26.22 | 67.02 | 32.98 | 26.28 | 1 | 67.02 | 0.06 | 1 | 0.00 |
| 24 | 173209 | 32.69 | -32.76 | 65.46 | 34.54 | 32.76 | 0 | 34.54 | 0.07 | 1 | 0.00 |
| 127 | 117709 | 23.62 | -23.55 | 17.22 | 82.77 | 23.62 | 1 | 17.22 | 0.07 | 1 | 0.01 |
| 33 | 129733 | 24.59 | -24.66 | 60.63 | 39.37 | 24.66 | 0 | 39.37 | 0.07 | 1 | 0.00 |
| 123 | 102653 | 23.70 | -23.62 | 50.34 | 49.65 | 23.70 | 1 | 50.34 | 0.08 | 1 | 0.01 |
63 rows × 11 columns
Here we see something really interesting: points with very small mag_diff have very small dist_check as well
histplot('prime', df = df[(df["mag_diff"] < 1) & (df["prime"] < 200000)])
Seems pretty random when this phenomena occurs.
histplot('dist_check')
This distribution is very similar to that of mag_diff.
histplot("dist_check", df = df_small_mag)
Here we see this phenomena: those with small mag diff almost always have small dist check. Let's look at some of these primes
for prime in random.choices(df_small_mag["prime"].values, k = 3):
plot_diff_lst(prime)
Basically these have some sort of antisymmetry, where the index of the highest peak, reflected across the middle, is the index of the lowest valley. Further, the magnitude of the peak and valley are basically the same
df[df["mag_diff"] < .08]["dist_check"].sort_values()
66 0.00
54 0.00
94 0.00
110 0.00
117 0.00
...
122 0.01
130 0.01
82 0.01
123 0.01
44 8.78
Name: dist_check, Length: 63, dtype: float64
df[(df["mag_diff"] < .08) & (df["dist_check"] == 8.78)]
| prime | pos_error | neg_error | x_pos_error | x_neg_error | max_error | max_error_ind | x_max_error | mag_diff | dist_check | |
|---|---|---|---|---|---|---|---|---|---|---|
| 44 | 105097 | 21.34 | -21.35 | 79.66 | 29.12 | 21.35 | 0 | 29.12 | 0.01 | 8.78 |
histplot('dist_check', df = df[df["mag_diff"] <= .17], title = "Distance Check when Mag Diff <= .17", savefig = True, savename = "Dist Check")
data = pd.DataFrame({
'Mag Diff Ind' : df["mag_diff"].apply(lambda diff : "<= .17" if diff <= .17 else "> .17"),
'Prime Mod 4' : df["mod_4"]
})
contingency_table = pd.crosstab(data["Prime Mod 4"], data["Mag Diff Ind"])
contingency_table
| Mag Diff Ind | <= .17 | > .17 |
|---|---|---|
| Prime Mod 4 | ||
| 1 | 64 | 0 |
| 3 | 0 | 69 |
sns.set_style("whitegrid")
# Create a heatmap of the contingency table (excluding the margins for better visualization)
heatmap_data = contingency_table
plt.figure(figsize=(10, 8))
sns.heatmap(heatmap_data, annot=True, cmap="YlGnBu", fmt="d", linewidths=.5, cbar = False)
plt.title("Frequency Table")
path_to_save = f"{plot_path}/Frequency Table.png"
plt.savefig(path_to_save)
plt.show()
plot_diff_lst(105097)
This one you can tell has two peaks and two valleys that are pretty much the same. So if the peaks are p1, p2 and the valleys are v1, v2, usually we get pi matches with vi, but here we have pi matching with vj
for prime in random.choices(df[df["mod_4"] == 3]["prime"].values, k = 10):
plot_diff_lst(prime)
Ok I later found something probably trivial but really cool: it seems that all primes that are congruent to 1 modulo 4 have some form of antisymmetry. By that I mean that flipping across the horizontal axis and then across the vertical axis makes the diff plot roughly the same (i.e. if you rotate the plot 180 degrees it looks kinda similar).
On the other hand, all primes that are congruent to 3 modulo 4 seem to have some form of symmetry. By that I mean flipping across the vertical axis seems to retain the diff plot
Interestingly, it's not exact, but it's approximately correct every time
plt.figure(figsize = (20,20))
sns.heatmap(df.corr(), square = True, annot = True,
vmin = -1, vmax = 1, linewidths = 3)
plt.title("Correlation Map\n", fontsize = 25)
plt.xticks(rotation = 90)
plt.yticks(rotation = 0)
plt.show()
Honestly this is not a super relevant plot because all of this is basically feature engineering (with a lot of non linear stuff like max/min), which means looking at correlation is not entirely relevant.
What is interesting though is seeing if more patterns emerge by looking at primes that are or aren't congruent to 1 mod 4.
countplot('mod_4')
df_1 = df[df["mod_4"] == 1]
df_3 = df[df["mod_4"] == 3]
histplot("neg_error", df= df_1[df_1['neg_error'] > -50])
histplot("neg_error", df= df_3)
Here we see something interesting: only primes congruent to 3 mod 4 are the 0 max neg error property (recall this is a property related to translation). Maybe there's a correlation between translation and mod_4.
Ok, I just plotted each prime that 3 mod 4 and 1 mod 4 and here are my observations:
for prime in random.choices(df_1['prime'].values, k = 4):
plot_diff_lst(prime)
Notice every diff plot is antisymmetric and centered around 0!
for prime in random.choices(df_3['prime'].values, k = 3):
plot_diff_lst(prime)
Notice these are all symmetric and may or may not be all translated. Here's an example of one that isn't translated
plot_diff_lst(111103)
Now let's actually analyze the difference lists itself
lst_path = os.path.join("/Users", "jcheigh", "Thesis", 'data')
get_lst_path = lambda prime : f'{lst_path}/Prime = {prime} Diff List.txt'
def get_lst(prime):
path = get_lst_path(prime)
with open(path, "r") as txt_file:
lst = txt_file.read().splitlines()
return [float(x) for x in lst]
large_prime_1 = get_lst(1020389)
large_prime_3 = get_lst(1000003)
small_prime_1 = get_lst(129749)
small_prime_3 = get_lst(115151)
Given the no translation property, we would expect large_prime_1 and small_prime_1 to have means around 0
print(f"Mean of Large Prime 1 Diff List: {round(np.mean(large_prime_1), 10)}")
print(f"Mean of Small Prime 1 Diff List: {round(np.mean(small_prime_1), 10)}")
Mean of Large Prime 1 Diff List: 0.0 Mean of Small Prime 1 Diff List: -0.0
They're exactly 0!
print(f"Mean of Large Prime 3 Diff List: {round(np.mean(large_prime_3), 10)}")
print(f"Mean of Small Prime 3 Diff List: {round(np.mean(small_prime_3), 10)}")
Mean of Large Prime 3 Diff List: 13.5065499004 Mean of Small Prime 3 Diff List: 4.1492537625
This also aligns; they're not completely 0.
We also expect large_prime_1 and small_prime_1 to be antisymmetric. By this we mean data(i) ~ -1 * data(-i).
def plot_check(lst, name, xlabel):
plt.figure(figsize = (10,8))
p = sns.histplot(lst, bins = 30,
kde = True, fill = True,
edgecolor = 'black', linewidth = 3
)
p.axes.lines[0].set_color("orange")
plt.title(name)
plt.xlabel(xlabel)
path_to_save = f"{plot_path}/{name}.png"
plt.savefig(path_to_save)
plt.show()
antisymmetry_check = lambda lst : [lst[i] + lst[-i] for i in range(len(lst) // 2)]
symmetry_check = lambda lst : [lst[i] - lst[-i] for i in range(len(lst) // 2)]
def plot_antisym_check(lst, name):
plt.figure(figsize = (10,8))
lst = antisymmetry_check(lst)
plot_check(lst, name, "D(x) + D(-x)")
def plot_sym_check(lst, name):
plt.figure(figsize = (10,8))
lst = symmetry_check(lst)
plot_check(lst, name, "D(x) - D(-x)")
plot_antisym_check(large_prime_1, "AntiSym Check: Large Prime 1")
<Figure size 720x576 with 0 Axes>
plot_antisym_check(small_prime_1, "AntiSym Check: Small Prime 1")
<Figure size 720x576 with 0 Axes>
These are almost identical, weird
plot_sym_check(large_prime_1, "Sym Check: Large Prime 1")
<Figure size 720x576 with 0 Axes>
plot_sym_check(small_prime_1, "Sym Check: Small Prime 1")
<Figure size 720x576 with 0 Axes>
And these are roughly normal, double weird
plot_sym_check(large_prime_3, "Sym Check: Large Prime 3")
<Figure size 720x576 with 0 Axes>
plot_sym_check(small_prime_3, "Sym Check: Small Prime 3")
<Figure size 720x576 with 0 Axes>
plot_antisym_check(large_prime_3, "AntiSym Check: Large Prime 3")
<Figure size 720x576 with 0 Axes>
plot_antisym_check(small_prime_3, "AntiSym Check: Small Prime 3")
<Figure size 720x576 with 0 Axes>